### Load required libraries
library(readxl)
library(tidyverse)
library(dplyr)
library(knitr)
library(DT)
library(httr)
library(jsonlite)

### Project helper functions (path is relative to the working directory)
source("../R/02_functions.R")

### Locations
# All data paths are built from the directory the script is run from.
current_dir <- getwd()

# MAF-like table of somatic mutations (Excel workbook)
maf_data <- "../data/_raw/41467_2017_1460_MOESM6_ESM_somatic_mutations.xlsx"
maf_path <- file.path(current_dir, maf_data)

### Import
# The first row of the sheet is skipped; the next row supplies column names.
maf_df <- read_excel(
  path = maf_path,
  skip = 1,
  col_names = TRUE
)

0.1 See mutations per patient

# One row per tumor_name with the number of rows it has in maf_df,
# largest counts first.
unique_tumor_counts <- dplyr::count(maf_df, tumor_name, sort = TRUE)

# Interactive table with copy/Excel/CSV export buttons.
datatable(
  unique_tumor_counts,
  caption = "Mutation counts per patient",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "excel", "csv"),
    scrollX = TRUE,
    pageLength = 10,
    columnDefs = list(
      list(
        targets = "_all",
        # Show null cells as the text 'NA' instead of leaving them blank.
        render = JS(
          "function(data, type, row, meta) {",
          "  return data === null ? 'NA' : data;",
          "}"
        )
      )
    )
  )
)

0.2 Pre-processing

# Each row of maf_df is one mutation record (rows are tallied per tumor_name
# as "Mutation counts per patient"), so the numbers reported below are counts
# of mutations, not of samples.
#
# Note: filter() keeps only rows where the condition is TRUE, so rows whose
# identifier is NA are not included in either subset.

# Rows whose Entrez_Gene_Id is the placeholder value 0
missing_entrez_df <- maf_df %>%
  filter(Entrez_Gene_Id == 0)

# Rows whose Hugo_Symbol is the placeholder value "Unknown"
missing_hugo_df <- maf_df %>%
  filter(Hugo_Symbol == "Unknown")

n_rows_1 <- nrow(missing_entrez_df)
n_rows_2 <- nrow(missing_hugo_df)

print(paste("Number of mutations missing an Entrez ID:", n_rows_1))
## [1] "Number of mutations missing an Entrez ID: 172"
print(paste("Number of mutations missing Hugo Symbol:", n_rows_2))
## [1] "Number of mutations missing Hugo Symbol: 957"

0.3 Export info to use Ensembl VEP (web version)

### Build one VEP input line per row of the MAF table
# Allele_ref_alt joins the two alleles as "ref/alt"; API_info is the final
# space-separated line: chromosome, start, end, alleles, strand.
api_input <- maf_df %>%
  select(
    Chromosome, Start_position, End_position,
    ref_allele, alt_allele, Strand
  ) %>%
  mutate(
    Allele_ref_alt = paste(ref_allele, alt_allele, sep = "/"),
    API_info = paste(
      Chromosome, Start_position, End_position, Allele_ref_alt, Strand,
      sep = " "
    )
  )

### Save the lines as plain text for upload to the web version of VEP
vep_input_data <- "../data/vep_input_grch37.txt"
vep_file_path <- file.path(current_dir, vep_input_data)

# No header, row names or quoting: the file holds only the raw lines.
write.table(
  api_input$API_info,
  file = vep_file_path,
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

0.4 Run VEP and explore output

### Location of the VEP result file
vep_out_data <- "../data/vep_output_grch37_01.txt"
vep_path <- file.path(current_dir, vep_out_data)

### Import the tab-separated VEP output (first line holds the column names)
vep_df <- read.delim(vep_path, header = TRUE)

### Interactive table with copy/Excel/CSV export buttons
datatable(
  vep_df,
  caption = "VEP output",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "excel", "csv"),
    scrollX = TRUE,
    pageLength = 10,
    columnDefs = list(
      list(
        targets = "_all",
        # Show null cells as the text 'NA' instead of leaving them blank.
        render = JS(
          "function(data, type, row, meta) {",
          "  return data === null ? 'NA' : data;",
          "}"
        )
      )
    )
  )
)

0.5 Session Info

# Print the R version, platform, locale and attached/loaded package versions
# used for this run, so the analysis environment can be reproduced.
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] jsonlite_1.8.8  httr_1.4.7      DT_0.31         knitr_1.45     
##  [5] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
##  [9] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
## [13] ggplot2_3.4.4   tidyverse_2.0.0 readxl_1.4.3   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4      compiler_4.3.2    tidyselect_1.2.0  jquerylib_0.1.4  
##  [5] scales_1.3.0      yaml_2.3.8        fastmap_1.1.1     R6_2.5.1         
##  [9] generics_0.1.3    htmlwidgets_1.6.4 munsell_0.5.0     tzdb_0.4.0       
## [13] bslib_0.6.1       pillar_1.9.0      rlang_1.1.3       utf8_1.2.4       
## [17] stringi_1.8.3     cachem_1.0.8      xfun_0.41         sass_0.4.8       
## [21] timechange_0.3.0  cli_3.6.2         withr_3.0.0       magrittr_2.0.3   
## [25] crosstalk_1.2.1   digest_0.6.34     grid_4.3.2        rstudioapi_0.15.0
## [29] hms_1.1.3         lifecycle_1.0.4   vctrs_0.6.5       evaluate_0.23    
## [33] glue_1.7.0        cellranger_1.1.0  fansi_1.0.6       colorspace_2.1-0 
## [37] rmarkdown_2.25    ellipsis_0.3.2    tools_4.3.2       pkgconfig_2.0.3  
## [41] htmltools_0.5.7